Matplotlib is a standard Python 2D plotting library (https://matplotlib.org/).
Seaborn is a Python data-visualization library built on top of Matplotlib (https://seaborn.pydata.org/).
# Notebook setup (IPython magics — only valid inside Jupyter, not plain Python):
# render plots inline and use high-resolution 'retina' figure output.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
# importing all of our libraries (note: this imports them, it does not install them)
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
# Styling: white-grid background, a large 20x20in default figure, and
# matplotlib's seaborn-flavored 'talk'/'ticks' style sheets.
# NOTE(review): the 'seaborn-*' style names were renamed to 'seaborn-v0_8-*'
# in matplotlib 3.6 — confirm against the installed matplotlib version.
sns.set_style("whitegrid")
sns.set(rc={'figure.figsize': (20, 20)})
matplotlib.style.use(['seaborn-talk', 'seaborn-ticks'])
# Load the NYPD collisions export; low_memory=False parses each column in one pass.
data = pd.read_csv("./NYPD_Crashes.csv", low_memory=False)
data.dtypes
# Build a combined DATETIME column from the raw DATE/TIME strings, then parse
# the DATE and TIME columns themselves as datetimes.
data['DATETIME'] = pd.to_datetime(data['DATE'] + ' ' + data['TIME'], format="%m/%d/%Y %H:%M")
data['TIME'] = pd.to_datetime(data['TIME'], format="%H:%M")
data['DATE'] = pd.to_datetime(data['DATE'], format="%m/%d/%Y")
# Boolean flags: did this collision involve at least one injury / one fatality?
data['INJURY'] = data['NUMBER OF PERSONS INJURED'] > 0
data['DEATH'] = data['NUMBER OF PERSONS KILLED'] > 0
# Scatter of the raw data — bad/missing coordinates (e.g. 0, 0) swamp the plot.
data.plot(kind='scatter', x='LONGITUDE', y='LATITUDE')
# Restrict to a rough NYC bounding box before plotting again.
clean_mask = (
    (data.LATITUDE > 40)
    & (data.LATITUDE < 41)
    & (data.LONGITUDE < -72)
    & (data.LONGITUDE > -74.5)
)
cleandf = data[clean_mask]
cleandf.plot(kind='scatter', x='LONGITUDE', y='LATITUDE')
# Same plot on a larger canvas.
cleandf.plot(kind='scatter', x='LONGITUDE', y='LATITUDE', figsize=(20, 15))
# Sampling: we can draw either a fixed number of rows or a fraction of the data.
# First, a fixed-size sample of 10,000 collisions.
sample = cleandf.sample(n=10000)
sample.plot(kind='scatter', x='LONGITUDE', y='LATITUDE', figsize=(20, 15))
# Next, a 1% sample of the dataset.
sample = cleandf.sample(frac=0.01)
sample.plot(kind='scatter', x='LONGITUDE', y='LATITUDE', figsize=(20, 15))
# Shrinking the markers makes the full dataset legible again...
cleandf.plot(kind='scatter', x='LONGITUDE', y='LATITUDE', figsize=(20, 15), s=0.5)
# ...and adding transparency reveals density wherever points overlap.
cleandf.plot(kind='scatter', x='LONGITUDE', y='LATITUDE',
             figsize=(20, 15), s=0.5, alpha=0.05)
# Hexagonal binning: aggregate points into hexagonal cells and color by count.
cleandf.plot(kind='hexbin', x='LONGITUDE', y='LATITUDE',
             gridsize=100, cmap=plt.cm.Blues, figsize=(15, 12))
plt.subplots(figsize=(20, 15))
# Work on a sample: density plots take a while to compute on the full dataset.
sample = cleandf.sample(10000)
# Bivariate kernel-density estimate of collision locations.
# FIX: seaborn >= 0.12 removed positional bivariate data and the
# shade/shade_lowest/n_levels parameters; use the x=/y= keyword API with
# fill=, thresh= and levels= instead.
sns.kdeplot(
    x=sample.LONGITUDE,
    y=sample.LATITUDE,
    gridsize=100,         # controls the resolution of the estimation grid
    cmap=plt.cm.rainbow,  # color scheme
    fill=True,            # filled density plot (False would draw contours only)
    alpha=0.5,
    thresh=0.05,          # hide the lowest-density region (was shade_lowest=False)
    levels=50,            # how many contours/levels to draw
)
plt.subplots(figsize=(20, 15))
sample = cleandf.sample(10000)
# Contour-only version of the density plot.
# FIX: seaborn >= 0.12 removed positional bivariate data and the
# shade/shade_lowest/n_levels parameters; use the x=/y= keyword API with
# fill= and levels= instead.
sns.kdeplot(
    x=sample.LONGITUDE,
    y=sample.LATITUDE,
    gridsize=100,
    cmap=plt.cm.rainbow,
    fill=False,   # contours only (was shade=False)
    thresh=0.05,  # hide the lowest-density region (was shade_lowest=False)
    levels=25,    # was n_levels=25
)
# Combine the scatter plot with the contour plot: draw the scatter first,
# then hand its Axes to kdeplot via ax= so both render on the same figure.
sample = cleandf.sample(10000)
scatterplot = cleandf.plot(
    kind='scatter',
    x='LONGITUDE',
    y='LATITUDE',
    figsize=(20, 15),
    s=0.5,
    alpha=0.1)
# FIX: seaborn >= 0.12 removed positional bivariate data and the
# shade/shade_lowest/n_levels parameters; use the x=/y= keyword API with
# fill= and levels= instead.
sns.kdeplot(
    x=sample.LONGITUDE,
    y=sample.LATITUDE,
    gridsize=100,
    cmap=plt.cm.rainbow,
    fill=False,        # contours only (was shade=False)
    thresh=0.05,       # hide the lowest-density region (was shade_lowest=False)
    levels=20,         # was n_levels=20
    alpha=1,
    ax=scatterplot)    # draw onto the scatter plot's Axes
# Re-read the raw data and drop any rows missing coordinates.
data = pd.read_csv("./NYPD_Crashes.csv", low_memory=False)
data.dropna(subset=["LATITUDE", "LONGITUDE"], inplace=True)
data.head()
# Keep only the columns we need for the map.
lat_long = data[["LATITUDE", "LONGITUDE", "DATE", "TIME", "BOROUGH", "VEHICLE TYPE CODE 1"]]
lat_long.head()
# Prototype with the first 100 rows.
test = lat_long[:100]
test
# Pull each column out as a plain Python list for Bokeh's data source.
lat_list = test['LATITUDE'].tolist()
lon_list = test['LONGITUDE'].tolist()
date_list = test['DATE'].tolist()
time_list = test['TIME'].tolist()
borough_list = test['BOROUGH'].tolist()
vehicle_list = test['VEHICLE TYPE CODE 1'].tolist()
# Interactive Google-Maps plot of the first 100 collisions with Bokeh.
# https://docs.bokeh.org/en/latest/
from bokeh.io import output_file, show
# NOTE(review): wildcard import pulls GMapOptions, GMapPlot, Range1d, Circle,
# the tool classes, etc. into scope; an explicit import list would be clearer.
from bokeh.models import *
# Center the map on lower Manhattan at a city-wide zoom level.
map_options = GMapOptions(lat=40.7128, lng=-74.0060, map_type="roadmap", zoom=11)
# NOTE(review): api_key is empty — Google Maps tiles will not load without a
# valid Maps API key; supply one before running.
plot = GMapPlot(x_range=Range1d(), y_range=Range1d(), map_options=map_options,api_key = "")
# One column per tooltip field, all aligned by row index.
source = ColumnDataSource(
data = dict(
lat=lat_list,
lon=lon_list,
date = date_list,
time = time_list,
borough = borough_list,
vehicle = vehicle_list
))
# Blue circle glyph per collision, keyed to the lon/lat columns above.
# NOTE(review): Circle's `size` argument was dropped in Bokeh 3.x (use Scatter
# or `radius`) — confirm the installed Bokeh version.
circle = Circle(x="lon", y="lat", size=15, fill_color="blue", fill_alpha=0.8, line_color=None)
plot.add_glyph(source, circle)
# Standard pan/zoom/select interactions.
plot.add_tools(PanTool(), WheelZoomTool(), BoxSelectTool(), BoxZoomTool())
plot.title.text="NYC Accidents"
# Hover tooltips showing the collision's metadata.
# NOTE(review): newer Bokeh expects formatter keys spelled with the '@' prefix
# (e.g. '@date'); the bare keys below match the older API — verify.
plot.add_tools(HoverTool(
tooltips=[
( 'date', '@date' ),
( 'time', '@time' ),
( 'borough', '@borough' ),
( 'vehicle', '@vehicle' )
],
formatters={
'date' : 'datetime', # use 'datetime' formatter for 'date' field
'time' : 'printf',
'borough' : 'numeral',
'vehicle' : 'numeral'
},
# 'vline' shows tooltips for all glyphs sharing the cursor's x-position —
# an unusual choice for a map; 'mouse' is the per-point alternative.
mode='vline'
))
# Uncomment to write the map to a standalone HTML file instead.
# output_file("gmap_plot.html")
show(plot)
# New notebook section (Citi Bike data): re-run the inline/retina magics
# and re-import the plotting stack.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
# NOTE(review): the 'seaborn-*' style names were renamed to 'seaborn-v0_8-*'
# in matplotlib 3.6 — confirm against the installed matplotlib version.
matplotlib.style.use(['seaborn-talk', 'seaborn-ticks', 'seaborn-whitegrid'])
import pandas as pd  # duplicate import — harmless, already imported above
import sqlite3
con = sqlite3.connect('citibikeDataForViz.db') # connect to our SQLite database
The crontab entry used to collect the data (runs the collection script every minute) is:
*/1 * * * * /Users/siegmanA/anaconda3/bin/python ~/Desktop/NYU-Projects-in-Programming-Fall-2019/(Class\ 7)\ Data\ Visualization/citibike_cron_script.py >> ~/Desktop/tmp/citiCron.log 2>&1
# Quick sanity check that the StationsData table is populated.
check = pd.read_sql("SELECT * FROM StationsData LIMIT 3", con=con)
check
# Pull the station-snapshot columns we need for the fullness analysis.
df = pd.read_sql("""SELECT station_id,
stationName,
availableBikes,
availableDocks,
totalDocks,
latitude,
longitude,
lastCommunicationTime
FROM StationsData""", con=con)
# Parse the stored timestamp strings.
# NOTE(review): the format mixes %H (24-hour clock) with %p (AM/PM marker);
# %p has no effect alongside %H — confirm the stored timestamp format.
df['lastCommunicationTime'] = pd.to_datetime(df['lastCommunicationTime'], format='%Y-%m-%d %H:%M:%S %p')
df.head()
len(df)
df.tail()
# How 'full' is each bike station at a given time?
df['percent_full'] = df['availableBikes'] / df['totalDocks']
# Mean fullness across all stations per timestamp.
# FIX: interpolate(method='pad') is deprecated in modern pandas; ffill() is the
# documented equivalent forward-fill of missing values.
station_timeseries = df.pivot_table(
    index='lastCommunicationTime',
    values='percent_full',
    aggfunc='mean'
).ffill()
station_timeseries.head(5)
df = df[df.lastCommunicationTime != '1969-12-31 07:00:00']
station_timeseries = df.pivot_table(
index='lastCommunicationTime',
values='percent_full',
aggfunc='mean'
).interpolate(method='time')
station_timeseries.tail()
# then, let's plot that over time
%matplotlib inline
station_timeseries.plot(alpha=.5, figsize=(18, 9), ylim=(0,1), xlim=('2019-10-10 06', '2019-10-10 06:30'))
Let's limit our plot to just two stations (ids 161 and 3260), which are nearby and tend to exhibit similar behavior. Remember that the list of stations is available as a JSON feed.
# Look up the station ids for two nearby stations by name.
df[df.stationName.str.contains("Mercer") & df.stationName.str.contains("Bleecker")].head()
df[df.stationName.str.contains("LaGuardia")].head()
# Pivot again, now with one column per station.
station_timeseries = (
    df.pivot_table(index='lastCommunicationTime',
                   columns='station_id',
                   values='percent_full',
                   aggfunc='mean')
      .interpolate(method='time')
)
station_timeseries.tail()
# Plot just those two stations over the same half-hour window.
station_timeseries[[161, 3260]].plot(
    alpha=0.5,
    legend=False,
    figsize=(20, 5),
    xlim=('2019-10-10 06', '2019-10-10 06:30'),
    ylim=(0, 1),
)
For our next analysis, we are going to try to find bike stations that have similar behaviors over time. A very simple technique that we can use to find similar time series is to treat the time series as vectors, and compute their correlation. Pandas provides the corr function that can be used to calculate the correlation of columns. (If we want to compute the correlation of rows, we can just take the transpose of the dataframe using the transpose() function, and compute the correlations there.)
# Pairwise Pearson correlation between the station columns: this matrix is our
# station-to-station similarity measure.
similarities = station_timeseries.corr(method='pearson')
similarities.head(5)
Let's see the similarities of the two stations that we examined above.
# The two nearby stations from above should be strongly correlated.
stations = [161, 3260]
similarities.loc[stations, stations]
# Another nearby pair:
# 393: E 5 St & Avenue C
# 2003: 1 Ave & E 18 St
stations = [393, 2003]
similarities.loc[stations, stations]
For bookkeeping purposes, we are going to drop stations that generate NaN values, as we cannot use such entries for our analysis.
# Count the non-NaN similarity entries each station has.
check = similarities.count()
# Any station with fewer than the maximum count has NaNs somewhere; drop those
# stations from both the rows and the columns of the similarity matrix.
incomplete = check < check.max()
todrop = check[incomplete].index.values
similarities.drop(todrop, axis='index', inplace=True)
similarities.drop(todrop, axis='columns', inplace=True)
Without explaining too much about clustering, we are going to use a clustering technique and cluster together bike stations that are "nearby" according to our similarity analysis. For this, we need to first convert our similarities to distance.
We are now going to convert our similarities into distance metrics. Our distance values will be always positive, and bounded between 0 and 1.
# Correlation lies in [-1, 1], so (1 - corr) lies in [0, 2]; halving maps it
# into [0, 1], and squaring keeps it there while spreading the values.
distances = (0.5 * (1 - similarities)) ** 2
distances.head(5)
The clustering code is very simple: The code below will create two groups of stations.
from sklearn.cluster import AgglomerativeClustering  # imported but not used below
from sklearn.cluster import KMeans
# Partition the stations into two clusters using each station's row of the
# distance matrix as its feature vector.
# NOTE(review): no random_state is set, so cluster labels may differ between runs.
cluster = KMeans(n_clusters=2)
cluster.fit(distances.values)
We will now take the results of the clustering and associate each of the data points into a cluster.
# Pair each station id with the cluster it was assigned to.
labels = pd.DataFrame({
    "station_id": distances.index.values.tolist(),
    "cluster": cluster.labels_,
})
labels
Let's see how many stations are in each cluster.
# Count how many stations landed in each cluster.
labels.pivot_table(index='cluster', aggfunc='count')
We will start by assigning a color to each cluster, so that we can plot each station timeline in its cluster's color. (We provide a long list of colors, so that we can change the number of clusters in the earlier code and still get nicely colored results.)
# One color per cluster id; the extra entries let us raise n_clusters later.
colors = ['red', 'black', 'green', 'magenta', 'yellow', 'blue', 'white', 'cyan']
labels['color'] = labels['cluster'].map(lambda cid: colors[cid])
labels.head(10)
# Every station's time series, tinted with its cluster's color.
stations_plot = station_timeseries.plot(
    color=labels['color'],
    linewidth=1,
    alpha=0.5,
    legend=False,
    figsize=(20, 5),
    xlim=('2019-10-10 06', '2019-10-10 06:30'),
    ylim=(0, 1),
)
The plot still looks messy. Let's try to plot instead a single line for each cluster. To represent the cluster, we are going to use the median fullness value across all stations that belong to a cluster, for each timestamp. For that, we can again use a pivot table: we define the communication_time as one dimension of the table, and cluster as the other dimension, and we use the median function.
For that, we first join our original dataframe with the results of the clustering, using the merge command, and add an extra column that includes the clusterid for each station. Then we compute the pivot table.
# Attach the cluster assignment to each raw snapshot row, then take the
# median fullness per (timestamp, cluster) pair.
merged = df.merge(labels, how='inner', on='station_id')
median_cluster = merged.pivot_table(
    index='lastCommunicationTime',
    columns='cluster',
    values='percent_full',
    aggfunc='median',
)
median_cluster.head(15)
Now, we can plot the medians for the two clusters.
# One median-fullness line per cluster, over a five-minute window.
median_cluster.plot(
    color=colors,
    linewidth=2,
    alpha=0.75,
    figsize=(20, 5),
    xlim=('2019-10-10 06', '2019-10-10 06:05'),
    ylim=(0, 1),
    grid=True,
)
And just for fun and for visual decoration, let's put the two plots together. We will fade the individual station time series heavily (by setting alpha=0.005) and make the median lines more prominent by increasing their linewidths. We will limit our plot to a short window of data:
# Overlay: heavily faded individual station series underneath, with the bold
# per-cluster median lines drawn on the same Axes on top.
stations_plot = station_timeseries.plot(
    color=labels["color"],
    alpha=0.005,
    legend=False,
    figsize=(20, 5),
)
median_cluster.plot(
    ax=stations_plot,
    color=colors,
    linewidth=3,
    alpha=0.5,
    xlim=('2019-10-10 06', '2019-10-10 06:05'),
    ylim=(0, 1),
)